{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tokenizer\n",
    "\n",
    "## 基本用法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
    "\n",
    "sentences = [\n",
    "    'I love my dog',\n",
    "    'I love my cat'\n",
    "]\n",
    "\n",
    "# num_words = 100代表took top 100 words by columns，因为出现频率少的单词对Accuracy影响少，但延长了训练时间\n",
    "tokenizer = Tokenizer(num_words = 100)\n",
    "# fit的过程中，会把所有大小变成小写，并去掉标点符号\n",
    "tokenizer.fit_on_texts(sentences)\n",
    "word_index = tokenizer.word_index\n",
    "word_index\n",
    "# 编码顺序按出现频率和出现顺序排序"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[1, 2, 3, 4], [1, 2, 3, 5]]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 把句子转成index数组\n",
    "sequences = tokenizer.texts_to_sequences(sentences)\n",
    "sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['i love my dog', 'i love my cat']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 把index转成句子\n",
    "sentences = tokenizer.sequences_to_texts(sequences)\n",
    "sentences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'dog' in word_index"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# num_words\n",
    "\n",
    "对全部单词编码，但只对前num_words个单词转码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[1, 2, 3, 4], [1, 2, 3, 5]]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
    "\n",
    "sentences = [\n",
    "    'I love my dog',\n",
    "    'I love my cat'\n",
    "]\n",
    "\n",
    "# num_words = 100代表took top 100 words by columns，因为出现频率少的单词对Accuracy影响少，但延长了训练时间\n",
    "tokenizer = Tokenizer(num_words = 8)\n",
    "# fit的过程中，会把所有大小变成小写，并去掉标点符号\n",
    "tokenizer.fit_on_texts(sentences)\n",
    "word_index = tokenizer.word_index\n",
    "word_index\n",
    "# 编码顺序按出现频率和出现顺序排序\n",
    "sequences = tokenizer.texts_to_sequences(sentences)\n",
    "sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "min(tokenizer.num_words, len(tokenizer.word_index))\n",
    "#tokenizer.num_words\n",
    "#len(tokenizer.word_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word_index['dog']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## oov"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 默认情况下，tokenizer会把不认识的单词跳过（ignore）\n",
    "# 可以把不认识的单词设置为OOV\n",
    "tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 序列化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\"class_name\": \"Tokenizer\", \"config\": {\"num_words\": 100, \"filters\": \"!\\\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n\", \"lower\": true, \"split\": \" \", \"char_level\": false, \"oov_token\": null, \"document_count\": 2, \"word_counts\": \"{\\\"i\\\": 2, \\\"love\\\": 2, \\\"my\\\": 2, \\\"dog\\\": 1, \\\"cat\\\": 1}\", \"word_docs\": \"{\\\"my\\\": 2, \\\"dog\\\": 1, \\\"love\\\": 2, \\\"i\\\": 2, \\\"cat\\\": 1}\", \"index_docs\": \"{\\\"3\\\": 2, \\\"4\\\": 1, \\\"2\\\": 2, \\\"1\\\": 2, \\\"5\\\": 1}\", \"index_word\": \"{\\\"1\\\": \\\"i\\\", \\\"2\\\": \\\"love\\\", \\\"3\\\": \\\"my\\\", \\\"4\\\": \\\"dog\\\", \\\"5\\\": \\\"cat\\\"}\", \"word_index\": \"{\\\"i\\\": 1, \\\"love\\\": 2, \\\"my\\\": 3, \\\"dog\\\": 4, \\\"cat\\\": 5}\"}}\n"
     ]
    }
   ],
   "source": [
    "# tokenizer存在json文件中\n",
    "\n",
    "json_token = tokenizer.to_json()\n",
    "with open('file\\\\tokenizer.json', 'w') as f:\n",
    "    print(json_token)\n",
    "    f.write(json_token)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<keras_preprocessing.text.Tokenizer at 0x1e38a93b780>"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 从json文件中读tokenizer\n",
    "from tensorflow.keras.preprocessing.text import tokenizer_from_json\n",
    "\n",
    "with open('file\\\\tokenizer.json', 'r') as f:\n",
    "    json_string = f.read()\n",
    "    tokenizer = tokenizer_from_json(json_string)\n",
    "\n",
    "tokenizer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# padding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 0, 0, 1],\n",
       "       [0, 0, 1, 2],\n",
       "       [0, 1, 2, 3],\n",
       "       [1, 2, 3, 4]])"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "\n",
    "sequences = [\n",
    "    [1],\n",
    "    [1,2],\n",
    "    [1,2,3],\n",
    "    [1,2,3,4]\n",
    "]\n",
    "padded = pad_sequences(sequences)\n",
    "# 句子长度不同时，所有句子都padding到最长句子的长度。默认的padding方式是在前面填0\n",
    "padded"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1, 0, 0, 0],\n",
       "       [1, 2, 0, 0],\n",
       "       [1, 2, 3, 0],\n",
       "       [1, 2, 3, 4]])"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 改为在句子尾部填0\n",
    "sequences = [\n",
    "    [1],\n",
    "    [1,2],\n",
    "    [1,2,3],\n",
    "    [1,2,3,4]\n",
    "]\n",
    "padded = pad_sequences(sequences, padding='post')\n",
    "padded"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# trunicate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 1],\n",
       "       [1, 2],\n",
       "       [2, 3],\n",
       "       [3, 4]])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sequences = [\n",
    "    [1],\n",
    "    [1,2],\n",
    "    [1,2,3],\n",
    "    [1,2,3,4]\n",
    "]\n",
    "# 设置句子的最大长度，不足则填充，超长则截断\n",
    "padded = pad_sequences(sequences, maxlen=2)\n",
    "# 默认的截断方式是保留最后的同几个字符，将前面的截掉\n",
    "padded"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 1],\n",
       "       [1, 2],\n",
       "       [1, 2],\n",
       "       [1, 2]])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sequences = [\n",
    "    [1],\n",
    "    [1,2],\n",
    "    [1,2,3],\n",
    "    [1,2,3,4]\n",
    "]\n",
    "# 设置为保留开关的字符，把后面的截掉\n",
    "padded = pad_sequences(sequences, maxlen=2, truncating='post')\n",
    "# 默认的截断方式是保留最后的同几个字符，将前面的截掉\n",
    "padded"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 图像增强"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensorflow.keras.preprocessing.image import ImageDataGenerator\n",
    "# from pyimagesearch.minivggnet import MiniVGGNet\n",
    "import tensorflow.keras as keras\n",
    "\n",
    "aug = ImageDataGenerator(rotation_range=20, zoom_range=0.15, width_shift_range=0.2, height_shift_range=0.2,\n",
    "                         shear_range=0.15, horizontal_flip=True, fill_mode='nearest')\n",
    "\n",
    "history = model.fit_generator(aug.flow(trainX, trainY, batch_size=BS),\n",
    "                             validation_data=(testX, testY),\n",
    "                             steps_per_epoch=len(trainX)//BS,\n",
    "                             epochs = EPOCHS)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
